{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": 3
  },
  "orig_nbformat": 2
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import sklearn \n",
    "#as scikit_learn\n",
    "import scipy\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.manifold import TSNE\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the DeepWalk export to analyse: keep exactly one `fileName` line uncommented.\n",
    "#path='USER_DIR'\n",
    "#WY\n",
    "#fileName = \"deepWalk_wy_12t9yd_ins_total_2\" #DONE\n",
    "fileName = \"deepWalk_wy_12t9yd_outs_total\" #DONE\n",
    "#fileName = \"deepWalk_wy_13AM_ins_total_2\" #DONE\n",
    "#fileName = \"deepWalk_wy_13AM_outs_total\" #DONE\n",
    "#fileName = \"deepWalk_wy_115p_ins_total_2\" #DONE\n",
    "#fileName = \"deepWalk_wy_115p_outs_total\" #DONE\n",
    "\n",
    "#CD\n",
    "#fileName = \"deepWalk_cd_19Dy_ins_total_4\" #DONE\n",
    "#fileName = \"deepWalk_cd_19Dy_outs_total_5\"\n",
    "#fileName = \"deepWalk_cd_1EmL_ins_total_2\" #DONE\n",
    "#fileName = \"deepWalk_cd_1EmL_outs_total\" #DONE\n",
    "\n",
    "#NP\n",
    "#fileName = \"deepWalk_NP_1Mz_ins_total_2\" #DONE\n",
    "#fileName = \"deepWalk_NP_1Mz_outs_total\"\n",
    "\n",
    "#Control\n",
    "#fileName = \"deepWalk_control_1Hes_ins_total\"\n",
    "#fileName = \"deepWalk_control_1Hes_outs_total\"\n",
    "\n",
    "# Load the selected export and index the rows by the `n.index` column.\n",
    "d = pd.read_csv('USER_DIR'+fileName+'.csv')\n",
    "d = d.set_index(\"n.index\")\n",
    "\n",
    "# `n.deepWalk_128` is parsed as one bracketed, comma-separated string per row.\n",
    "# Split it into one column per component and convert to float explicitly, so the\n",
    "# downstream estimators receive a numeric matrix rather than object-dtype strings.\n",
    "# A malformed component raises here instead of later inside the PCA fit.\n",
    "nd = d['n.deepWalk_128'].str.strip('[]').str.split(',', expand=True).astype(float)\n",
    "\n",
    "# Feature matrix used by the PCA step (one row per `n.index`).\n",
    "X=nd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Principal Component Analysis (PCA)\n",
    "# Project the embedding matrix X onto its first two principal components.\n",
    "\n",
    "pca = PCA(n_components=2)\n",
    "X_red = pca.fit_transform(X)\n",
    "\n",
    "# Keep the projected coordinates in a DataFrame (columns 0 and 1)\n",
    "PCA_components = pd.DataFrame(X_red)\n",
    "\n",
    "# Scatter the two components to check visually whether clusters emerge\n",
    "fig, ax = plt.subplots(figsize=(9, 6))\n",
    "ax.scatter(PCA_components[0], PCA_components[1], alpha=.1, color='black')\n",
    "ax.set_xlabel('PCA 1')\n",
    "ax.set_ylabel('PCA 2')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def elbow_fn(X_red, max_k=9, random_state=0):\n",
    "    \"\"\"Plot k-means inertia against the number of clusters (elbow method).\n",
    "\n",
    "    Inertia is the sum of squared distances from each sample to its nearest\n",
    "    cluster center. The k at which the curve bends (the 'elbow') is a\n",
    "    reasonable choice for the number of clusters.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    X_red : array-like of shape (n_samples, n_features)\n",
    "        Samples to cluster.\n",
    "    max_k : int, default=9\n",
    "        Largest number of clusters to try; k runs from 1 to max_k inclusive.\n",
    "    random_state : int or None, default=0\n",
    "        Seed passed to every KMeans fit so repeated runs draw the same curve.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    None\n",
    "        The curve is displayed with matplotlib.\n",
    "    \"\"\"\n",
    "    ks = range(1, max_k + 1)\n",
    "\n",
    "    # Fit one model per candidate k and record its inertia\n",
    "    inertias = [\n",
    "        KMeans(n_clusters=k, random_state=random_state).fit(X_red).inertia_\n",
    "        for k in ks\n",
    "    ]\n",
    "\n",
    "    plt.plot(ks, inertias, '-o', color='black')\n",
    "    plt.xlabel('number of clusters, k')\n",
    "    plt.ylabel('inertia')\n",
    "    plt.xticks(ks)\n",
    "    plt.show()\n",
    "\n",
    "# Run the elbow function to choose the number of clusters (K) for the analysis\n",
    "elbow_fn(X_red)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of clusters, read off the elbow plot (the k at which the inertia curve bends)\n",
    "K=4\n",
    "kmeans = KMeans(n_clusters=K, random_state=0)\n",
    "kmeans.fit(X_red)\n",
    "\n",
    "# Per-sample cluster labels and the fitted cluster centers\n",
    "lab = kmeans.labels_\n",
    "centers = kmeans.cluster_centers_\n",
    "\n",
    "# Attach both PCA coordinates and the cluster label to a copy of the original dataframe\n",
    "d2 = d.assign(\n",
    "    X_red_X=X_red[:,0],\n",
    "    X_red_Y=X_red[:,1],\n",
    "    cluster_label=lab,\n",
    ")\n",
    "\n",
    "# Export the augmented table to a CSV file for use in a spreadsheet\n",
    "d2.to_csv(\"USER_DIR\"+fileName+\"_output.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a plot combining the centrality measure (PageRank) and community detection:\n",
    "# marker size is scaled from `pagerank`, marker color comes from `community`.\n",
    "# Out-degree and in-degree are the X and Y axes respectively.\n",
    "\n",
    "# Create the axes explicitly and hand them to pandas. Without `ax=`, pandas opens its\n",
    "# own figure, leaving a separately created plt.figure() blank and its figsize unused.\n",
    "fig, ax = plt.subplots(figsize=(9, 6))\n",
    "z = d.pagerank*30\n",
    "d.plot.scatter(x='out_degree', y='in_degree', c='community', s=z,\n",
    "               colormap='brg', alpha=0.5, ax=ax)\n",
    "\n",
    "ax.set_xlabel(\"out_degree\")\n",
    "ax.set_ylabel(\"in_degree\")\n",
    "ax.set_title(\"Louvain Community Detection\")\n",
    "ax.grid()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a scatter plot of the graph embeddings to see how the DeepWalk algorithm has grouped the network components.\n",
    "# In this case we should see contextual relationships between Bitcoin addresses and transactions;\n",
    "# there is also a possibility this procedure will identify anomalies occurring on the graph/network.\n",
    "\n",
    "# Set the font size before any text is created: rc changes only apply to artists\n",
    "# created afterwards, so calling this after the labels would leave them unchanged.\n",
    "plt.rc('font', size=8)\n",
    "\n",
    "plt.figure(figsize=(9 ,6))\n",
    "\n",
    "# Samples in PCA space, colored by their k-means cluster label\n",
    "plt.scatter(X_red[:,0], X_red[:,1], c=lab, s=30, alpha=0.5)\n",
    "\n",
    "plt.xlabel(\"PCA x1\")\n",
    "plt.ylabel(\"PCA x2\")\n",
    "# The line break in the title must stay an escape sequence inside the string literal\n",
    "plt.title(\"PCA decomposition from DeepWalk graph embedding \\n + K-means clustering\")\n",
    "\n",
    "# Mark each cluster center with a white disc and draw its cluster index on top\n",
    "plt.scatter(centers[:, 0], centers[:, 1], marker='o',\n",
    "            c=\"white\", alpha=1, s=100, edgecolor='k')\n",
    "for i, c in enumerate(centers):\n",
    "    plt.scatter(c[0], c[1], marker='$%d$' % i, s=50, alpha=1, edgecolor='r')\n",
    "\n",
    "plt.grid()\n",
    "plt.show()"
   ]
  }
 ]
}